library(readr)
rent <- read_csv("student_predictions.csv", 
col_types = cols(
  `Posted On` = col_date(format = "%m/%d/%Y"),
  Size = col_integer(), 
  Unit.Floor = col_integer(), 
  Total.Floors = col_integer()
  )
)
colnames(rent)
##  [1] "Posted On"         "BHK"               "Size"             
##  [4] "Unit.Floor"        "Total.Floors"      "Area.Type"        
##  [7] "City"              "Area Locality"     "Furnishing.Status"
## [10] "Tenant.Preferred"  "Bathroom"          "Point.of.Contact" 
## [13] "Rent"              "Marie"             "Lucas"            
## [16] "Grace"             "Bethany"           "Jenna"            
## [19] "Emma"              "Natalie"           "Hannah"           
## [22] "Chris"             "Jacob"
predictions <- rent[, -(1:13)]
head(predictions)
## # A tibble: 6 × 10
##    Marie  Lucas Grace Bethany  Jenna   Emma Natalie Hannah Chris  Jacob
##    <dbl>  <dbl> <dbl>   <dbl>  <dbl>  <dbl>   <dbl>  <dbl> <dbl>  <dbl>
## 1  5667.  -857.  4100   9095.  5818.  5339.   8881.  5254.  6463  5258.
## 2  8232. 11922.  8300   9215.  8604.  8231.   8881.  8065.  8782  8230.
## 3 34172. 49947. 37000  15422. 44414. 29004.  28625. 36119. 31304 33567.
## 4  5463.  5221.  6700   8688.  6917.  6197.   8881.  6088.  7340  6240.
## 5 15285. 44453. 14500  17856. 15088. 14840.  15597. 14900. 15266 14797.
## 6  7238. 10185.  7700   9388.  8189.  7896.   8881.  7732.  8469  7838.
summary(predictions)
##      Marie            Lucas            Grace           Bethany       
##  Min.   :  3380   Min.   : -1378   Min.   :  2700   Min.   :      0  
##  1st Qu.: 11167   1st Qu.: 22961   1st Qu.: 12100   1st Qu.:  11280  
##  Median : 16141   Median : 41226   Median : 15550   Median :  17152  
##  Mean   : 32248   Mean   : 47279   Mean   : 31518   Mean   :  31105  
##  3rd Qu.: 32527   3rd Qu.: 60849   3rd Qu.: 31450   3rd Qu.:  26773  
##  Max.   :581209   Max.   :189988   Max.   :468700   Max.   :1278290  
##      Jenna             Emma           Natalie           Hannah      
##  Min.   :  5226   Min.   :  4159   Min.   :  8881   Min.   :     0  
##  1st Qu.: 11613   1st Qu.: 11291   1st Qu.: 13090   1st Qu.: 11511  
##  Median : 15921   Median : 15477   Median : 15598   Median : 15765  
##  Mean   : 32503   Mean   : 32277   Mean   : 36880   Mean   : 31634  
##  3rd Qu.: 32623   3rd Qu.: 32165   3rd Qu.: 31692   3rd Qu.: 32527  
##  Max.   :796643   Max.   :776983   Max.   :609500   Max.   :660605  
##      Chris             Jacob       
##  Min.   :   6393   Min.   :  4133  
##  1st Qu.:  11235   1st Qu.: 11366  
##  Median :  15266   Median : 15515  
##  Mean   :  34211   Mean   : 32170  
##  3rd Qu.:  31049   3rd Qu.: 32236  
##  Max.   :1879887   Max.   :818594

Missing value imputation

library(dplyr)
median_rent <- median(rent$Rent)

predictions <- predictions %>%
  mutate(
    Lucas = ifelse(
      Lucas < 0,
      median_rent,
      Lucas
    )
  ) %>%
  mutate(
    Bethany = ifelse(
      Bethany < 1,
      median_rent,
      Bethany
    )
  ) %>%
  mutate(
    Hannah = ifelse(
      Hannah < 1,
      median_rent,
      Hannah
    )
  ) 

summary(predictions)
##      Marie            Lucas               Grace           Bethany       
##  Min.   :  3380   Min.   :    11.24   Min.   :  2700   Min.   :   7853  
##  1st Qu.: 11167   1st Qu.: 22960.78   1st Qu.: 12100   1st Qu.:  11531  
##  Median : 16141   Median : 41225.89   Median : 15550   Median :  17152  
##  Mean   : 32248   Mean   : 47417.04   Mean   : 31518   Mean   :  31238  
##  3rd Qu.: 32527   3rd Qu.: 60849.42   3rd Qu.: 31450   3rd Qu.:  26773  
##  Max.   :581209   Max.   :189987.58   Max.   :468700   Max.   :1278290  
##      Jenna             Emma           Natalie           Hannah      
##  Min.   :  5226   Min.   :  4159   Min.   :  8881   Min.   :  4460  
##  1st Qu.: 11613   1st Qu.: 11291   1st Qu.: 13090   1st Qu.: 11565  
##  Median : 15921   Median : 15477   Median : 15598   Median : 15818  
##  Mean   : 32503   Mean   : 32277   Mean   : 36880   Mean   : 31678  
##  3rd Qu.: 32623   3rd Qu.: 32165   3rd Qu.: 31692   3rd Qu.: 32527  
##  Max.   :796643   Max.   :776983   Max.   :609500   Max.   :660605  
##      Chris             Jacob       
##  Min.   :   6393   Min.   :  4133  
##  1st Qu.:  11235   1st Qu.: 11366  
##  Median :  15266   Median : 15515  
##  Mean   :  34211   Mean   : 32170  
##  3rd Qu.:  31049   3rd Qu.: 32236  
##  Max.   :1879887   Max.   :818594

Calculating RMSE

library(Metrics)
actual_rent_log <- log10(rent$Rent)
predictions <- log10(as.matrix(predictions))

RMSEs <- apply(
  predictions, 
  MARGIN = 2, 
  FUN = function(x) { 
    Metrics::rmse(actual_rent_log, x) 
  }
)

RMSE <- round(sort(RMSEs), 4)
df <- as.data.frame(RMSE)
df$name <- names(RMSE)

# preserve the order in plot
df$name <- factor(df$name, levels = rev(df$name))
library(ggplot2)
ggplot(df, aes(x = name, y = RMSE)) +
  geom_segment(
    aes(x = name, xend = name, y = 0, yend = RMSE), 
    color = "gray", 
    lwd = 2
  ) +
  geom_point(
    size = 4,
    pch = 21, 
    bg = "red", 
    col = "red"
  ) +
  geom_text(
    aes(label = RMSE), 
    color = "blue", 
    size = 3, 
    nudge_y = .02
  ) +
  theme(
    axis.text.x=element_blank(),
    axis.ticks.x=element_blank(),
    axis.title.y = element_blank()
  )+
  coord_flip()

Marie’s model

library(mgcv)
library(nlme)

gen.am <- gam(
  logRent ~ s(logSize,k=11) + City + Furnishing.Status + Bathroom 
    + BHK + Area.Type + Point.of.Contact +Total.Floors, 
  data = rent.practice
)
gen.am
#R-sq.(adj) =  0.821 

Hannah’s model

fit2 <- lm(logrent ~ BHK + Size + City + `Furnishing Status` 
           + Bathroom  +`Point of Contact`, 
           data = rent2
)
summary(fit2)
#Adjusted R-squared:  0.8099 

Emma’s model

model <- lm(Log.Rent ~ City + BHK + Size + Tenant.Preferred + 
              Furnishing.Status + Point.of.Contact + Num.Floors, 
           data = rent5)
#Adjusted R-squared:  0.8119 

Jacob’s model

lm <- lm(rent.log$Rent ~ rent.log[, 2] + rent.log[, 3] + 
           rent.log[, 7] + rent.log[, 8] + rent.log[, 11])

#Size + City + BHK + Contact + Furnishing Status  ???
# Adjusted R-squred: 0.8056 

Ensemble approach (many models vote)

predictions_subset <- subset(
  predictions,
  select = c(Marie, Hannah, Emma, Jacob)
)
                      
ensembl_prediction <- apply(
  predictions_subset,
  MARGIN = 1,
  FUN = mean
)
Metrics::rmse(actual_rent_log, ensembl_prediction)
## [1] 0.1638336
library(corrplot)
df <- cbind(predictions, rent = actual_rent_log)
corrplot.mixed(cor(df), order = "AOE")

dist_pearson <- function(x, ...)
  as.dist(1-cor(t(x), method="pearson"))
plot(
  hclust(
    dist_pearson(t(df)),
    method = "average"
  ),
  xlab = NULL
)

library(reshape2)
wide_data = cbind(t(df), names = colnames(df))
long_data <- melt(wide_data, id.vars = c("names") )
colnames(long_data)[1] = "name"
head(long_data)
##      name Var2            value
## 1   Marie      3.75332455600047
## 2   Lucas      4.21748394421391
## 3   Grace      3.61278385671974
## 4 Bethany      3.95881311230356
## 5   Jenna      3.76477910405509
## 6    Emma      3.72742909811498
long_data$value <- as.numeric(long_data$value)

ggplot(long_data, aes(x=name, y=value, color = name, fill = name)) +
  geom_violin(trim=FALSE, show.legend = FALSE) 

ggplot(long_data, aes(x=name, y=value, color = name, fill = name)) +
  geom_violin(trim=FALSE, show.legend = FALSE) +
  ylim(3.5, 6)

Factors

Factors Marie Hannah Emma Jacob5 Jacob6 Jacob7
Size O O O O O O
City O O O O O O
Contact O O O O O O
BHK O O O O O O
Bathroom O O - O O O
Furnishing O O O - O O
Total.Floors O - O - - O
Area.Type O - - - - -
Tenant - - O - - -
data_all <- cbind(
  rent[, 1:12], 
  Rent = log10(rent$Rent), 
  predictions
)
#write.csv(data_all, "model_performance.csv", row.names = FALSE)
  ggplot(data_all,
         aes(
           x = Rent,
           y = Marie,
           color = City
         )) +
    geom_point() +
    xlim(3.4, 5.7) +
    ylim(3.4, 5.7) +
    xlab("Actual Rent (log10)") + 
    geom_abline(intercept = 0, slope = 1, size = 0.5)

  library(plotly)
  p <- ggplot(data_all,
         aes(
           x = Rent,
           y = Marie,
           color = City
         )) +
    geom_point() +
    xlim(3.4, 5.7) +
    ylim(3.4, 5.7) +
    xlab("Actual Rent (log10)") + 
    geom_abline(intercept = 0, slope = 1, size = 0.5)
  
  ggplotly(p)